{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Guest House - Medium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading spark-stubs, spark-hive\n",
      "Adding Hive conf dir /opt/hive/conf to classpath\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SLF4J: No SLF4J providers were found.\n",
      "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
      "SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a target=\"_blank\" href=\"http://jupyter:4040\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.types._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.expressions.Window\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@45e61d04"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:3.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark._\n",
    "import org.apache.spark.sql._\n",
    "import org.apache.spark.sql.types._\n",
    "import org.apache.spark.sql.functions._\n",
    "import org.apache.spark.sql.expressions.Window\n",
    "\n",
    "val spark = {\n",
    "    NotebookSparkSession.builder()\n",
    "    .progress(false)\n",
    "    .appName(\"app13-2\")\n",
    "    // .master(\"spark://192.168.31.31:7077\")\n",
    "    .master(\"local[*]\")\n",
    "    .config(\"spark.sql.warehouse.dir\", \n",
    "            \"hdfs://192.168.31.31:9000/user/hive/warehouse\") \n",
    "    .config(\"spark.cores.max\", \"4\") \n",
    "    .config(\"spark.executor.instances\", \"1\") \n",
    "    .config(\"spark.executor.cores\", \"2\") \n",
    "    .config(\"spark.executor.memory\", \"10g\") \n",
    "    .config(\"spark.shuffle.service.enabled\", \"false\") \n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\") \n",
    "    .config(\"spark.sql.catalogImplementation\", \"hive\")\n",
    "    .config(\"spark.sql.repl.eagerEval.enabled\", \"true\")\n",
    "    .config(\"spark.driver.allowMultipleContexts\", \"true\")\n",
    "    .getOrCreate()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\u001b[39m\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m\n",
       "\u001b[36mhiveCxt\u001b[39m: \u001b[32msql\u001b[39m.\u001b[32mhive\u001b[39m.\u001b[32mHiveContext\u001b[39m = org.apache.spark.sql.hive.HiveContext@48a37a1d"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "def sc = spark.sparkContext\n",
    "val hiveCxt = new org.apache.spark.sql.hive.HiveContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mRichDF\u001b[39m"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Credit to Aivean\n",
    "implicit class RichDF(val ds:DataFrame) {\n",
    "    def showHTML(limit: Int = 50, truncate: Int = 100) = {\n",
    "        import xml.Utility.escape\n",
    "        val data = ds.take(limit)\n",
    "        val header = ds.schema.fieldNames.toSeq        \n",
    "        val rows: Seq[Seq[String]] = data.map { row =>\n",
    "          row.toSeq.map {cell =>\n",
    "            val str = cell match {\n",
    "              case null => \"null\"\n",
    "              case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "              case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "              case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "              case _ => cell.toString\n",
    "            }\n",
    "            if (truncate > 0 && str.length > truncate) {\n",
    "              // do not show ellipses for strings shorter than 4 characters.\n",
    "              if (truncate < 4) str.substring(0, truncate)\n",
    "              else str.substring(0, truncate - 3) + \"...\"\n",
    "            } else {\n",
    "              str\n",
    "            }\n",
    "          }: Seq[String]\n",
    "        }\n",
    "    publish.html(s\"\"\" <table>\n",
    "                <tr>\n",
    "                 ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "                </tr>\n",
    "                ${rows.map {row =>\n",
    "                  s\"<tr>${row.map{c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "                }.mkString}\n",
    "            </table>\n",
    "        \"\"\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[36mbooking\u001b[39m: \u001b[32mDataFrame\u001b[39m = [booking_id: int, booking_date: string ... 6 more fields]\n",
       "\u001b[36mguest\u001b[39m: \u001b[32mDataFrame\u001b[39m = [id: int, first_name: string ... 2 more fields]\n",
       "\u001b[36mroom\u001b[39m: \u001b[32mDataFrame\u001b[39m = [id: int, room_type: string ... 1 more field]\n",
       "\u001b[36mrate\u001b[39m: \u001b[32mDataFrame\u001b[39m = [room_type: string, occupancy: int ... 1 more field]\n",
       "\u001b[36mextra\u001b[39m: \u001b[32mDataFrame\u001b[39m = [extra_id: int, booking_id: int ... 2 more fields]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val booking = hiveCxt.table(\"sqlzoo.booking\")\n",
    "val guest = hiveCxt.table(\"sqlzoo.guest\")\n",
    "val room = hiveCxt.table(\"sqlzoo.room\")\n",
    "val rate = hiveCxt.table(\"sqlzoo.rate\")\n",
    "val extra = hiveCxt.table(\"sqlzoo.extra\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.\n",
    "Ruth Cadbury. Show the total amount payable by guest Ruth Cadbury for her room bookings. You should JOIN to the rate table using room_type_requested and occupants.\n",
    "\n",
    "```\n",
    "+--------------------+\n",
    "| SUM(nights*amount) |\n",
    "+--------------------+\n",
    "|             552.00 |\n",
    "+--------------------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>SUM(nights*amount)</th>\n",
       "                </tr>\n",
       "                <tr><td>552.0</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(guest.filter((col(\"first_name\")===\"Ruth\") && \n",
    "              (col(\"last_name\")===\"Cadbury\"))\n",
    " .join(booking, (guest(\"id\")===booking(\"guest_id\")))\n",
    " .join(rate, (col(\"room_type_requested\")===rate(\"room_type\")) && \n",
    "             (col(\"occupants\")===rate(\"occupancy\")))\n",
    " .groupBy()\n",
    " .agg(sum(col(\"nights\") * col(\"amount\")).alias(\"SUM(nights*amount)\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7.\n",
    "Including Extras. Calculate the total bill for booking 5346 including extras.\n",
    "\n",
    "```\n",
    "+-------------+\n",
    "| SUM(amount) |\n",
    "+-------------+\n",
    "|      118.56 |\n",
    "+-------------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>SUM(amount)</th>\n",
       "                </tr>\n",
       "                <tr><td>118.56</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(booking.filter(col(\"booking_id\")===5346)\n",
    " .join(rate, (booking(\"occupants\")===rate(\"occupancy\")) && \n",
    "             (booking(\"room_type_requested\")===rate(\"room_type\")))\n",
    " .union(booking.filter(col(\"booking_id\")===5346)\n",
    "        .join(extra, \"booking_id\"))\n",
    " .groupBy()\n",
    " .agg(sum(\"amount\").alias(\"SUM(amount)\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8.\n",
    "Edinburgh Residents. For every guest who has the word “Edinburgh” in their address show the total number of nights booked. Be sure to include 0 for those guests who have never had a booking. Show last name, first name, address and number of nights. Order by last name then first name.\n",
    "\n",
    "```\n",
    "+-----------+------------+---------------------------+--------+\n",
    "| last_name | first_name | address                   | nights |\n",
    "+-----------+------------+---------------------------+--------+\n",
    "| Brock     | Deidre     | Edinburgh North and Leith |      0 |\n",
    "| Cherry    | Joanna     | Edinburgh South West      |      0 |\n",
    "| Murray    | Ian        | Edinburgh South           |     13 |\n",
    "| Sheppard  | Tommy      | Edinburgh East            |      0 |\n",
    "| Thomson   | Michelle   | Edinburgh West            |      3 |\n",
    "+-----------+------------+---------------------------+--------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>id</th><th>last_name</th><th>first_name</th><th>address</th><th>sum(nights)</th>\n",
       "                </tr>\n",
       "                <tr><td>1591</td><td>Brock</td><td>Deidre</td><td>Edinburgh North and Leith</td><td>0</td></tr><tr><td>1599</td><td>Cherry</td><td>Joanna</td><td>Edinburgh South West</td><td>0</td></tr><tr><td>1027</td><td>Murray</td><td>Ian</td><td>Edinburgh South</td><td>13</td></tr><tr><td>1617</td><td>Sheppard</td><td>Tommy</td><td>Edinburgh East</td><td>0</td></tr><tr><td>1457</td><td>Thomson</td><td>Michelle</td><td>Edinburgh West</td><td>3</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(guest.filter(col(\"address\").like(\"%Edinburgh%\"))\n",
    " .join(booking, (guest(\"id\")===booking(\"guest_id\")), joinType=\"left\")\n",
    " .na.fill(0, Array(\"nights\"))\n",
    " .groupBy(\"id\", \"last_name\", \"first_name\", \"address\")\n",
    " .agg(sum(\"nights\"))\n",
    " .orderBy(\"last_name\", \"first_name\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9.\n",
    "How busy are we? For each day of the week beginning 2016-11-25 show the number of bookings starting that day. Be sure to show all the days of the week in the correct order.\n",
    "\n",
    "```\n",
    "+------------+----------+\n",
    "| i          | arrivals |\n",
    "+------------+----------+\n",
    "| 2016-11-25 |        7 |\n",
    "| 2016-11-26 |        8 |\n",
    "| 2016-11-27 |       12 |\n",
    "| 2016-11-28 |        7 |\n",
    "| 2016-11-29 |       13 |\n",
    "| 2016-11-30 |        6 |\n",
    "| 2016-12-01 |        7 |\n",
    "+------------+----------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>booking_date</th><th>arrivals</th>\n",
       "                </tr>\n",
       "                <tr><td>2016-11-25</td><td>7</td></tr><tr><td>2016-11-26</td><td>8</td></tr><tr><td>2016-11-27</td><td>12</td></tr><tr><td>2016-11-28</td><td>7</td></tr><tr><td>2016-11-29</td><td>13</td></tr><tr><td>2016-11-30</td><td>6</td></tr><tr><td>2016-12-01</td><td>7</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(booking.filter((col(\"booking_date\") >= \"2016-11-25\") && \n",
    "                (col(\"booking_date\") <= date_add(lit(\"2016-11-25\"), lit(6))))\n",
    " .groupBy(\"booking_date\")\n",
    " .agg(count(\"booking_id\").alias(\"arrivals\"))\n",
    " .orderBy(\"booking_date\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10.\n",
    "How many guests? Show the number of guests in the hotel on the night of 2016-11-21. Include all occupants who checked in that day but not those who checked out.\n",
    "\n",
    "```\n",
    "+----------------+\n",
    "| SUM(occupants) |\n",
    "+----------------+\n",
    "|             39 |\n",
    "+----------------+\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>SUM(occupants</th>\n",
       "                </tr>\n",
       "                <tr><td>39</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(booking.filter((col(\"booking_date\") <= \"2016-11-21\") &&\n",
    "                (date_add(col(\"booking_date\"), col(\"nights\")) > \"2016-11-21\"))\n",
    " .groupBy()\n",
    " .agg(sum(\"occupants\").alias(\"SUM(occupants\"))\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".sc",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
